{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "private_outputs": true,
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/mikexcohen/Statistics_book/blob/main/stats_ch11_ttest.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Modern statistics: Intuition, Math, Python, R\n",
    "## Mike X Cohen (sincxpress.com)\n",
    "#### https://www.amazon.com/dp/B0CQRGWGLY\n",
    "#### Code for chapter 18 (bias)\n",
    "\n",
    "---\n",
    "\n",
    "# About this code file:\n",
    "\n",
    "### This notebook will reproduce most of the figures in this chapter (some figures were made in Inkscape), and illustrate the statistical concepts explained in the text. The point of providing the code is not just for you to recreate the figures, but for you to modify, adapt, explore, and experiment with the code.\n",
    "\n",
    "### Solutions to all exercises are at the bottom of the notebook.\n",
    "\n",
    "#### This code was written in google-colab. The notebook may require some modifications if you use a different IDE."
   ],
   "metadata": {
    "id": "mzfXc9E3Xq7k"
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "dFT1qwVEyxTW"
   },
   "outputs": [],
   "source": [
    "# import libraries and define global settings\n",
    "import numpy as np\n",
    "import scipy.stats as stats\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "# define global figure properties used for publication\n",
    "import matplotlib_inline.backend_inline\n",
    "matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # display figures in vector format\n",
    "plt.rcParams.update({'font.size':14,             # font size\n",
    "                     'savefig.dpi':300,          # output resolution\n",
    "                     'axes.titlelocation':'left',# title location\n",
    "                     'axes.spines.right':False,  # remove axis bounding box\n",
    "                     'axes.spines.top':False,    # remove axis bounding box\n",
    "                     })"
   ]
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "BQlwO5QkJv7v"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 1"
   ],
   "metadata": {
    "id": "YgKc7uVRJv5D"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# generate random data, trim smallest values\n",
    "\n",
    "N = 30\n",
    "\n",
    "# create new data\n",
    "data = np.random.randn(N)\n",
    "\n",
    "# trim\n",
    "idx = np.argsort(data)\n",
    "dataTrim = data[idx[2:]]\n",
    "\n",
    "# ttests\n",
    "ttestO = stats.ttest_1samp(data,0)\n",
    "ttestT = stats.ttest_1samp(dataTrim,0)\n",
    "\n",
    "# report the results\n",
    "print(f'Full: t({ttestO.df}) = {ttestO.statistic:.3f}, p = {ttestO.pvalue:.3f}')\n",
    "print(f'Trim: t({ttestT.df}) = {ttestT.statistic:.3f}, p = {ttestT.pvalue:.3f}')"
   ],
   "metadata": {
    "id": "_scNsnfYAeKr"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# generate random data, trim smallest or extremiest values\n",
    "\n",
    "N = 30\n",
    "numreps = 1000\n",
    "\n",
    "\n",
    "pLessThan05 = np.zeros((numreps,3))\n",
    "tValues = np.zeros((numreps,3))\n",
    "\n",
    "for expi in range(numreps):\n",
    "\n",
    "  # create new data\n",
    "  data = np.random.randn(N)\n",
    "\n",
    "  # trim\n",
    "  idx = np.argsort(data)\n",
    "  dataTrimL = data[idx[2:]]\n",
    "  dataTrimB = data[idx[1:-1]]\n",
    "\n",
    "  # ttests\n",
    "  ttestO = stats.ttest_1samp(data,0)      # O = original\n",
    "  ttestL = stats.ttest_1samp(dataTrimL,0) # L = left side trimmed\n",
    "  ttestB = stats.ttest_1samp(dataTrimB,0) # B = both sides trimmed\n",
    "\n",
    "  # store \"significances\"\n",
    "  pLessThan05[expi,0] = ttestO.pvalue<.05\n",
    "  pLessThan05[expi,1] = ttestL.pvalue<.05\n",
    "  pLessThan05[expi,2] = ttestB.pvalue<.05\n",
    "\n",
    "  # store t-values\n",
    "  tValues[expi,0] = ttestO.statistic\n",
    "  tValues[expi,1] = ttestL.statistic\n",
    "  tValues[expi,2] = ttestB.statistic\n",
    "\n",
    "\n",
    "# report the output\n",
    "print(f'   Without data trimming: {np.sum(pLessThan05[:,0],dtype=int):>3}/{expi+1} with p<.05 ({100*np.mean(pLessThan05[:,0]):>5.2f}%)')\n",
    "print(f' With symmetric trimming: {np.sum(pLessThan05[:,2],dtype=int):>3}/{expi+1} with p<.05 ({100*np.mean(pLessThan05[:,2]):>5.2f}%)')\n",
    "print(f'With asymmetric trimming: {np.sum(pLessThan05[:,1],dtype=int):>3}/{expi+1} with p<.05 ({100*np.mean(pLessThan05[:,1]):>5.2f}%)')"
   ],
   "metadata": {
    "id": "hNd5PIV6AeH-"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# visualize the change in t-values\n",
    "plt.figure(figsize=(5,5))\n",
    "\n",
    "# plot the t-values\n",
    "plt.plot(tValues[:,0],tValues[:,2],'ks',markersize=6,alpha=.8,markerfacecolor=(.5,.5,.5),label='Symmetric trim')\n",
    "plt.plot(tValues[:,0],tValues[:,1],'ko',markersize=6,alpha=.8,markerfacecolor=(.9,.9,.9),label='Asymmetric trim')\n",
    "\n",
    "# plot the unity line\n",
    "extT = np.max(np.abs(tValues))\n",
    "plt.plot([-extT,extT],[-extT,extT],linewidth=2,linestyle='--',color='gray',label='Unity')\n",
    "\n",
    "plt.xticks(range(-4,5,2))\n",
    "plt.yticks(range(-4,5,2))\n",
    "plt.xlabel('Original t-values')\n",
    "plt.ylabel('T-values after trimming')\n",
    "plt.legend()\n",
    "plt.grid()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('bias_ex1.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "-QY3hEaNAeFQ"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "U4sedRyVAeCj"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 2"
   ],
   "metadata": {
    "id": "kMZG0tqIbUoU"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# generate some data in a DataFrame\n",
    "df = pd.DataFrame(np.random.randn(50,10), columns=[f'v{i}' for i in range(10)])\n",
    "\n",
    "# Pearson correlation matrix\n",
    "R = df.corr()\n",
    "\n",
    "# Mask the diagonal to ignore r=1\n",
    "np.fill_diagonal(R.values,0)\n",
    "\n",
    "# find indices of max pair\n",
    "xi,yi = np.where(R.abs()==np.max(R.abs().values))[0]\n",
    "\n",
    "# get p-value\n",
    "pval = stats.pearsonr(df.iloc[:,xi],df.iloc[:,yi])[1]\n",
    "\n",
    "\n",
    "# Scatter plot of the variables with the highest correlation\n",
    "plt.figure(figsize=(6,5))\n",
    "sns.scatterplot(data=df, x=df.columns[xi], y=df.columns[yi],\n",
    "                s=100,edgecolor='k',facecolor=(.8,.8,.8),linewidth=2,alpha=.7)\n",
    "plt.title(f'r = {R.iloc[xi,yi]:.2f}, p = {pval:.3f}',loc='center')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('bias_ex2.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "YipkIzGLJvxV"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "### Code below is modified from Exercise 12.11\n",
    "\n",
    "# convert R to dataframes for seaborn\n",
    "R_df = pd.DataFrame(R,columns=df.columns,index=df.columns)\n",
    "\n",
    "# Bonferroni correction [ formula is (M*(M-1))/2 ]\n",
    "num_comparisons = (df.shape[1]*(df.shape[1]-1)) / 2\n",
    "bonferroni_thresh = .05 / num_comparisons\n",
    "\n",
    "\n",
    "# Create a matrix of annotations\n",
    "annot_array = R_df.astype(str).values\n",
    "\n",
    "# loop through all elements of the matrix and create a string to display\n",
    "for i in range(R_df.shape[0]):\n",
    "  for j in range(R_df.shape[1]):\n",
    "\n",
    "    # get the p-value and determine significance\n",
    "    pval = stats.pearsonr(df.iloc[:,i],df.iloc[:,j])[1]\n",
    "    significant = pval < bonferroni_thresh\n",
    "\n",
    "    # the string depends on the significance\n",
    "    if not significant:\n",
    "      # if non-significant, just the correlation coefficient\n",
    "      annot_array[i,j] = f'{R_df.iloc[i, j]:.2f}'\n",
    "    else:\n",
    "      # if significant, add an asterisk to the coefficient\n",
    "      annot_array[i,j] = f'{R_df.iloc[i, j]:.2f}*'\n",
    "\n",
    "    # don't need to report the diagonals\n",
    "    if i==j:\n",
    "      annot_array[i,j] = ''\n",
    "\n",
    "\n",
    "## now show the image\n",
    "plt.figure(figsize=(8,6))\n",
    "sns.heatmap(R_df,annot=annot_array,fmt='s',cmap='coolwarm',vmin=-.4,vmax=.4,\n",
    "            xticklabels=R_df.columns,yticklabels=R_df.columns)\n",
    "\n",
    "plt.title('Correlation matrix (*p<.05 corrected)',loc='center',weight='bold')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "FNhSrXzFcU-m"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "tdFxJZEq8kjA"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}